# An XML Assembler for MILA Hebrew Dictionary Entries (2013)
# Mitts, J. & Mitts, T. 
##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##

# This code uses the output of the MILA Morphological Analyzer developed by Itai and Wintner (2008) and selects stems based on the preference rule described in Mitts (2017).  

# Remove every object from the global environment so the script starts from a
# clean workspace.
# NOTE(review): this also clears the workspace of any session that source()s
# this file.
rm(list=ls())

# tm   : Corpus(), VectorSource(), termFreq() for the document-term matrix
# XML  : xmlParseDoc(), getNodeSet() and friends for reading the MILA output
# plyr : rbind.fill() for stacking per-document term counts
library(tm)
library(XML)
library(plyr)

# Every relative path used below ("Data/...", "Code/...") is resolved against
# this directory.
setwd("~/Downloads/Replication/")

# Hebrew stop words, read as a character vector with one element per line of
# the file. Used at the end of the script to drop bigrams containing them.
milot.kishur <- readLines("Data/hebrew-milot-kishur.txt")

# Function to create 2-grams
#
# Splits `x` on single spaces, discards the empty strings produced by
# consecutive spaces, and pairs every word with the word that follows it.
#
# x: character vector; all elements are split and pooled into one word
#    sequence before pairing.
#
# Returns a character vector with one "word1 word2" entry per adjacent pair,
# or "" when the input holds fewer than two words (no bigram can be formed).
BigramTokenizer <- function(x) {
  unigrams <- unlist(strsplit(x, split = " ", fixed = TRUE))
  unigrams <- unigrams[unigrams != ""]
  n <- length(unigrams)

  # With a single word the old `1:length(bigrams)` loop ran over c(1, 0) and
  # emitted the bogus bigram "word NA"; treat it like the empty case instead.
  if (n < 2) {
    return("")
  }

  # Pair words 1..(n-1) with words 2..n in one vectorised call.
  paste(unigrams[-n], unigrams[-1], sep = " ")
}


# Direct the program below to the folder containing the XML output from MILA
files <- list.files("Code/Hebrew_stemming/morph_output", full.names = TRUE)

# Part-of-speech element names, most preferred first. The first tag found
# anywhere under a token decides which analysis supplies the stem.
# The adjective lookup previously asked for "adjectives"; every other tag here
# is singular, so that lookup could never match and adjectives were silently
# skipped in favour of participles/verbs.
# NOTE(review): confirm "adjective" against the MILA schema. Stemmed files that
# already exist are not regenerated, so delete them to pick up this fix.
posPreference <- c("properName", "noun", "adjective", "participle", "verb")

# Return the "lexiconItem" attribute for the preferred analysis of one token,
# or NULL when the token has none of the tags in `preference`.
selectLexiconItem <- function(sentenceToken, preference = posPreference) {
  for (tag in preference) {
    hits <- xmlElementsByTagName(sentenceToken, tag, recursive = TRUE)
    if (length(hits) > 0) {
      # lexiconItem is read from the parent of the first matching element
      return(xmlGetAttr(xmlParent(hits[[1]]), "lexiconItem"))
    }
  }
  NULL
}

# This folder contains the destination stemmed text files
stemmedDir <- "Code/Hebrew_stemming/stemmed_output/"
dir.create(stemmedDir, showWarnings = FALSE, recursive = TRUE)

# seq_along() keeps the loop from running when the input folder is empty
# (1:length(files) would iterate over c(1, 0)).
for (iFile in seq_along(files)) {
  file <- files[iFile]
  print(paste("Opening and processing file #", iFile, "of", length(files), ",", file))

  # Swap only a literal trailing ".xml"; the old pattern ".xml" was an
  # unanchored regex whose dot matched any character.
  newFile <- paste0(stemmedDir, sub("\\.xml$", ".txt", basename(file)))

  # Files that were already stemmed in an earlier run are left untouched
  if (!file.exists(newFile)) {

    d <- xmlParseDoc(file)

    sentences <- getNodeSet(d, "//sentence")
    sentenceTexts <- character(length(sentences))
    print(paste("Looping through", length(sentences), "sentences in file..."))
    for (i in seq_along(sentences)) {
      # These are the tokens
      sentenceTokens <- xmlChildren(sentences[[i]])
      print(paste("Sentence", i, "- tokens: ", length(sentenceTokens)))

      # Tokens without a usable analysis return NULL and vanish in unlist()
      lexiconItems <- unlist(lapply(sentenceTokens, selectLexiconItem),
                             use.names = FALSE)

      # The leading "" reproduces the leading space each sentence has always
      # had; once newlines are stripped below it is the only separator
      # between consecutive sentences.
      if (length(lexiconItems) > 0) {
        sentenceTexts[i] <- paste(c("", lexiconItems), collapse = " ")
      }
    }

    # All sentences end up on a single line, with double quotes removed
    sentenceTexts_clean <- gsub("\"", "",
                                gsub("\n", "",
                                     paste(sentenceTexts, collapse = "\n"),
                                     fixed = TRUE),
                                fixed = TRUE)

    # Given a path, writeLines() opens and closes the file itself, so no
    # connection is left open if the write fails.
    writeLines(sentenceTexts_clean, newFile)

    rm(sentenceTexts, sentences, d)
  }
}

# Generate the document-term-matrix

stemmed_files <- list.files("Code/Hebrew_stemming/stemmed_output", full.names = TRUE)

# Read each stemmed file into one string with punctuation removed.
# The loop used to run over `files` (the XML inputs) instead of
# `stemmed_files`; whenever the two folders held a different number of files
# this either called readLines(NA) or left "FALSE" placeholder documents from
# the logical preallocation.
raw_text <- vapply(stemmed_files, function(path) {
  text <- paste(readLines(path), collapse = " ")
  text <- gsub("[[:punct:]]", "", text)
  gsub("\n", " ", text, fixed = TRUE)
}, character(1), USE.NAMES = FALSE)

corpus <- Corpus(VectorSource(raw_text), readerControl = list(language = "he"))

# Bigram counts per document, each as a one-row data frame whose columns are
# the bigrams. This step takes time to run on a large corpus.
doc_rows <- lapply(seq_len(length(corpus)), function(j) {
  counts <- termFreq(corpus[[j]]$content, control = list(tokenize = BigramTokenizer))
  as.data.frame(t(as.matrix(counts)))
})

# Stack all rows in a single rbind.fill() call (bigrams absent from a document
# become NA) rather than re-copying a growing data frame on every iteration.
dtm <- if (length(doc_rows) > 0) rbind.fill(doc_rows) else data.frame()

# Trim the document-term-matrix to exclude stop words
# A bigram is kept only if each of its words is longer than two characters and
# is not in the stop-word list.
keep_bigram <- vapply(colnames(dtm), function(term) {
  words <- strsplit(term, split = " ", fixed = TRUE)[[1]]
  all(nchar(words) > 2 & !(words %in% milot.kishur))
}, logical(1), USE.NAMES = FALSE)

# drop = FALSE keeps a data frame even when exactly one bigram survives
dtm.trim <- dtm[, keep_bigram, drop = FALSE]


